R packages I used during the workshop.
The dataset we will be using is the State Expenditures data set,
which you can load into your R session by passing the URL to
RSocrata::read.socrata().
This report provides information on expenditures (i.e., cash
transactions/payments) for the agencies that utilize the Statewide
Financial Management Application (SFMA) issued for the fiscal year 2024
(July 1, 2023 - June 30, 2024).
Viewing the data
We do not build an analysis around the data we have; we find the
data for the analysis that we need!
This chunk shows two ways we can load the data, one relies on an API
via RSocrata, whereas the second option relies on exporting the data
manually from Data.Oregon.Gov.
# Using RSocrata
state_expenditures <- read.socrata("https://data.oregon.gov/Revenue-Expense/Agency-Expenditures-Multi-Year-Report/y9g9-xsxs")
# Using csv file stored locally
#state_expenditures <- read.csv(here("data", "Agency_Expenditures_–_Multi-Year_Report_20250827.csv"), stringsAsFactors = FALSE)
glimpse(state_expenditures)
Rows: 567,728
Columns: 10
$ fiscal_year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 20…
$ agency <int> 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 9…
$ agency_1 <chr> "REAL ESTATE AGY", "REAL ESTATE AGY", "REAL ESTATE AGY", "REAL ESTATE AGY", "REAL ESTA…
$ budget_class <int> 3110, 3240, 4100, 4100, 4100, 4100, 4100, 4100, 4100, 4100, 4100, 4100, 4100, 4100, 41…
$ budget_class_1 <chr> "CLASS/UNCLASS SALARY & PER DIEM", "UNEMPLOYMENT ASSESSMENT", "INSTATE TRAVEL", "INSTA…
$ expend_class <int> 3111, 3231, 4101, 4101, 4101, 4101, 4101, 4104, 4105, 4106, 4106, 4108, 4108, 4108, 41…
$ expend_class_1 <chr> "REGULAR EMPLOYEES", "UNEMPLOYMENT COMPENSATION & ASSESSMENT", "INSTATE MEALS WITH OVE…
$ vendor <chr> "DEPARTMENT OF ADMINISTRATIVE SERVICES", "EMPLOYMENT DEPARTMENT", "FRANK LEONARD JR", …
$ expense <dbl> 270.72, 5131.00, 59.00, 73.75, 88.50, 295.00, 458.13, 120.00, 414.01, 898.28, 2843.22,…
$ vendor_st <chr> "OR", "OR", "", "", "", "", "", "MO", "MO", "", "MO", "", "", "", "", "", "OR", "MO", …
Data quality checks
We’ll clean-up some of the types that should be treated as
characters. We’ll also rename variables for improved readability.
# basic clean-up
state_expenditures_clean <-
state_expenditures %>%
mutate(across(c(agency , expend_class, budget_class), ~as.character(.))) %>%
rename(
"agency_code" = agency,
"agency_name" = agency_1,
"budget_class_code" = budget_class,
"budget_class_name" = budget_class_1,
"expend_class_code" = expend_class,
"expend_class_name" = expend_class_1
)
glimpse(state_expenditures_clean)
Rows: 567,728
Columns: 10
$ fiscal_year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,…
$ agency_code <chr> "919", "919", "919", "919", "919", "919", "919", "919", "919", "919", "919", "919",…
$ agency_name <chr> "REAL ESTATE AGY", "REAL ESTATE AGY", "REAL ESTATE AGY", "REAL ESTATE AGY", "REAL E…
$ budget_class_code <chr> "3110", "3240", "4100", "4100", "4100", "4100", "4100", "4100", "4100", "4100", "41…
$ budget_class_name <chr> "CLASS/UNCLASS SALARY & PER DIEM", "UNEMPLOYMENT ASSESSMENT", "INSTATE TRAVEL", "IN…
$ expend_class_code <chr> "3111", "3231", "4101", "4101", "4101", "4101", "4101", "4104", "4105", "4106", "41…
$ expend_class_name <chr> "REGULAR EMPLOYEES", "UNEMPLOYMENT COMPENSATION & ASSESSMENT", "INSTATE MEALS WITH …
$ vendor <chr> "DEPARTMENT OF ADMINISTRATIVE SERVICES", "EMPLOYMENT DEPARTMENT", "FRANK LEONARD JR…
$ expense <dbl> 270.72, 5131.00, 59.00, 73.75, 88.50, 295.00, 458.13, 120.00, 414.01, 898.28, 2843.…
$ vendor_st <chr> "OR", "OR", "", "", "", "", "", "MO", "MO", "", "MO", "", "", "", "", "", "OR", "MO…
We can use inspectdf to view the unique counts, and most
common values for each of the categorical variables.
# unique counts by category
state_expenditures_clean %>%
inspect_cat()
Column (2/8): agency_name
Column (3/8): budget_class_code
Column (4/8): budget_class_name
Column (5/8): expend_class_code
Column (6/8): expend_class_name
Column (7/8): vendor
Column (8/8): vendor_st
We can see that there are discrepancies between the agency, budget,
and expenditure class codes since the total counts for these are
different. Let’s take a look at those.
# duplicates for agency codes
state_expenditures_clean %>%
distinct(agency_code, agency_name) %>%
count(agency_code, sort = TRUE) %>%
filter(n > 1)
# duplicates for budget class codes
state_expenditures_clean %>%
distinct(budget_class_code, budget_class_name) %>%
count(budget_class_code, sort = TRUE) %>%
filter(n > 1)
# duplicates for expenditure class codes
state_expenditures_clean %>%
distinct(expend_class_code, expend_class_name) %>%
count(expend_class_code, sort = TRUE) %>%
filter(n > 1)
What states are included for vendors?
# show all states included
state_expenditures_clean %>%
count(vendor_st, sort = TRUE)
Interestingly, this extends beyond US-based vendors with 67 rows
which includes blanks, ‘XX’, and several Canadian provinces.
At this point we may decide that the data quality issues require
engagement with the data owners of this asset to proceed with our
analysis.
Or we may be comfortable with proceeding!
Exploratory Analysis
Viewing total expenditures by year
state_expenditures_clean %>%
group_by(fiscal_year) %>%
reframe(total_expend_in_billions = sum(expense)/1e9)
state_expenditures_clean %>%
group_by(fiscal_year) %>%
reframe(total_expend_in_billions = sum(expense)/1e9) %>%
ggplot(aes(x = as.factor(fiscal_year), y = total_expend_in_billions)) +
geom_col() +
scale_y_continuous(labels = scales::label_currency()) +
theme_classic() +
labs(
y = "Total Expenditures ($ Billions)",
x = NULL,
caption = "\nData source:\n \'Agency Expenditures - Multi-Year Report\'\n As of December 17, 2024 from Data.Oregon.Gov"
)

How about viewing by agencies with the top 5 highest expenditures in
2024?
top_agencies <-
state_expenditures_clean %>%
filter(fiscal_year == 2024) %>%
group_by(agency_name) %>%
reframe(total_expenditures = sum(expense)) %>%
arrange(-total_expenditures) %>%
slice(1:5) %>%
pull(agency_name)
summary_by_top_agencies <-
state_expenditures_clean %>%
filter(agency_name %in% top_agencies) %>%
group_by(fiscal_year, agency_name) %>%
reframe(total_expend_in_billions = sum(expense)/1e9) %>%
mutate(
agency_name = fct_reorder(agency_name, total_expend_in_billions, .desc = TRUE)
)
summary_by_top_agencies %>%
ggplot(aes(x = as.factor(fiscal_year), y = total_expend_in_billions, group = agency_name)) +
geom_col() +
facet_wrap(~ agency_name, ncol = 3, scales = "free_y", axes = "margins") +
scale_y_continuous(labels = scales::label_currency()) +
theme_classic() +
labs(
y = "Total expenditures ($ Billions)",
x = NULL,
caption = "\nData source:\n \'Agency Expenditures - Multi-Year Report\'\n As of December 17, 2024 from Data.Oregon.Gov",
title = "Top 5 Agencies with the Highest Expenditures in 2024",
subtitle = "Axes for total expenditures varies by agency to show differences in scale."
)

8.27.2025
Amelia L. Vargas
---
title: "SORA Inclusive Analytics Workshop"
output: html_notebook
---

R packages I used during the workshop. 

```{r setup}
knitr::opts_chunk$set(echo = TRUE)
library(here)
library(dplyr)
library(RSocrata)
library(ggplot2)
library(inspectdf)
library(forcats)
```

The dataset we will be using is the State Expenditures data set, which you can load into your R session by passing the URL to `RSocrata::read.socrata()`. 

About the data: 
<https://data.oregon.gov/Revenue-Expense/Agency-Expenditures-Multi-Year-Report/y9g9-xsxs/about_data>

This report provides information on expenditures (i.e., cash transactions/payments) for 
the agencies that utilize the Statewide Financial Management Application (SFMA) issued 
for the fiscal year 2024 (July 1, 2023 - June 30, 2024). 

# Purpose

Before defining our purpose we should consider... 

## Who might be interested in an analysis of expenditures? 

* Public employees  
* Agencies/agency heads  
* Public/taxpayers  
* Legislators  
* Vendors/Suppliers  
* Auditors  
* Analysts


## What are the benefits?

* Process improvements (internal and external)  
* Viewing outliers (identifying too small or too large of expenditures) 
* Trends, e.g. seeing the general pattern - what are the highest/lowest expenditures, what are the most common expenditures, etc.
* Compliance, to ensure that we are following our procurement rules
* Opportunities for consolidating, getting better contracts
* Better understanding of where taxes go

## What are the risks? 

* Legislators may see the analysis and think that costs are too high and use it as rationale to cut budgets  
* Patterns may be revealed that harm reputations (institutional, individual, etc)

Purpose: 
Make the expenditures data set more accessible to those interested in the analysis by providing

Expense trends  

* Overall expenditures by year  
* Expense by year by agency  
* Expense by category by agency  
* Expense by category by vendor  

Deep dive into outliers  

* High & Low expenditures  
* expenditures out of compliance with procurement statues/rules/guidance

# Viewing the data  

*We do not build an analysis around the data we have; we find the data for the analysis that we need!*

This chunk shows two ways we can load the data, one relies on an API via RSocrata, whereas the second option relies on exporting the data manually from 
Data.Oregon.Gov.

```{r load-expenditure-data, echo=TRUE}
# Using RSocrata
state_expenditures <- read.socrata("https://data.oregon.gov/Revenue-Expense/Agency-Expenditures-Multi-Year-Report/y9g9-xsxs")

# Using csv file stored locally
#state_expenditures <- read.csv(here("data", "Agency_Expenditures_–_Multi-Year_Report_20250827.csv"), stringsAsFactors = FALSE)

glimpse(state_expenditures)
```
## Data quality checks

We'll clean-up some of the types that should be treated as characters. We'll also rename variables for improved readability.

```{r echo=TRUE}
# basic clean-up
state_expenditures_clean <- 
  state_expenditures %>% 
  mutate(across(c(agency , expend_class, budget_class), ~as.character(.))) %>% 
  rename(
    "agency_code" = agency,
    "agency_name" = agency_1,
    "budget_class_code" = budget_class,
    "budget_class_name" = budget_class_1,
    "expend_class_code" = expend_class,
    "expend_class_name" = expend_class_1
  )

glimpse(state_expenditures_clean)
```
We can use `inspectdf` to view the unique counts, and most common values for each of the categorical variables. 

```{r echo=TRUE}
# unique counts by category
state_expenditures_clean %>% 
  inspect_cat()
```

We can see that there are discrepancies between the agency, budget, and expenditure class codes since the total counts for these are different. 
Let's take a look at those. 

```{r identify-duplicates, echo=TRUE}
# duplicates for agency codes
state_expenditures_clean %>% 
  distinct(agency_code, agency_name) %>% 
  count(agency_code, sort = TRUE) %>% 
  filter(n > 1)

# duplicates for budget class codes
state_expenditures_clean %>% 
  distinct(budget_class_code, budget_class_name) %>% 
  count(budget_class_code, sort = TRUE) %>% 
  filter(n > 1)

# duplicates for expenditure class codes
state_expenditures_clean %>% 
  distinct(expend_class_code, expend_class_name) %>% 
  count(expend_class_code, sort = TRUE) %>% 
  filter(n > 1)
```

What states are included for vendors?

```{r echo=TRUE}
# show all states included
state_expenditures_clean %>% 
  count(vendor_st, sort = TRUE)
```
Interestingly, this extends beyond US-based vendors with 67 rows which includes blanks, 'XX', and several Canadian provinces.

At this point we may decide that the data quality issues require engagement with the data owners of this asset to proceed with our analysis. 

Or we may be comfortable with proceeding!

# Exploratory Analysis

Viewing total expenditures by year

```{r echo=TRUE}
state_expenditures_clean %>% 
  group_by(fiscal_year) %>% 
  reframe(total_expend_in_billions = sum(expense)/1e9)
```

```{r echo=TRUE}
state_expenditures_clean %>% 
  group_by(fiscal_year) %>% 
  reframe(total_expend_in_billions = sum(expense)/1e9) %>% 
  ggplot(aes(x = as.factor(fiscal_year), y = total_expend_in_billions)) + 
  geom_col() +
  scale_y_continuous(labels = scales::label_currency()) +
  theme_classic() + 
  labs(
    y = "Total Expenditures ($ Billions)", 
    x = NULL,
    caption = "\nData source:\n \'Agency Expenditures - Multi-Year Report\'\n As of December 17, 2024 from Data.Oregon.Gov"
  )
```


How about viewing by agencies with the top 5 highest expenditures in 2024? 

```{r echo=TRUE}
top_agencies <- 
  state_expenditures_clean %>% 
  filter(fiscal_year == 2024) %>% 
  group_by(agency_name) %>% 
  reframe(total_expenditures = sum(expense)) %>% 
  arrange(-total_expenditures) %>% 
  slice(1:5) %>% 
  pull(agency_name)

summary_by_top_agencies <- 
  state_expenditures_clean %>% 
  filter(agency_name %in% top_agencies) %>% 
  group_by(fiscal_year, agency_name) %>% 
  reframe(total_expend_in_billions = sum(expense)/1e9) %>% 
  mutate(
    agency_name = fct_reorder(agency_name, total_expend_in_billions, .desc = TRUE)
  )
  
summary_by_top_agencies  %>% 
  ggplot(aes(x = as.factor(fiscal_year), y = total_expend_in_billions, group = agency_name)) + 
  geom_col() +
  facet_wrap(~ agency_name, ncol = 3, scales = "free_y", axes = "margins") + 
  scale_y_continuous(labels = scales::label_currency()) +
  theme_classic() + 
  labs(
    y = "Total expenditures ($ Billions)", 
    x = NULL,
    caption = "\nData source:\n \'Agency Expenditures - Multi-Year Report\'\n As of December 17, 2024 from Data.Oregon.Gov",
    title = "Top 5 Agencies with the Highest Expenditures in 2024",
    subtitle = "Axes for total expenditures varies by agency to show differences in scale."
  )
```

8.27.2025  
Amelia L. Vargas